This code is assembled to perform the steps to summarize differentially expressed genes and clusters of cells. The processed data were downloaded and saved in SEUSS_processed_data.
This cade has been pulled from https://github.com/yanwu2014/SEUSS-Analysis and specifically copied from hESC_summarize_results.R
The data for cells cultured in the pluripotent stem cell medium has been put into a compressed file in the Google Drive. You should be able to download it from the following link: https://drive.google.com/open?id=19qHsRFO4QwHhotxuw73MYRHB8CFYPM3c and extract the files onto your local computer. You will also need to change the file.handle appropriately (see below).
The following packages must be installed prior to running (many needed only to load swne package. Uncomment to execute.
getPackageIfNeeded <- function(pkg) {
if (!require(pkg, character.only=TRUE, quietly =TRUE))
install.packages(pkgs=pkg, dependencies=TRUE)
}
getBiocPackageIfNeeded <- function(pkg, vers = NULL) {
if (!require(pkg, character.only=TRUE, quietly =TRUE))
BiocManager::install(pkgs=pkg, version=vers)
}
bcpkgs <- list(list(pkg="BiocManager",vers="3.10"),
list(pkg="TxDb.Hsapiens.UCSC.hg38.knownGene",vers="3.10"),
list(pkg="org.Hs.eg.db"),
list(pkg="GO.db"),
list(pkg="qvalue", vers="3.10")
)
invisible(lapply(bcpkgs, function(x)
do.call(getBiocPackageIfNeeded, args=x)))
pkgs <- c("devtools","uwot","umap","Seurat")
invisible(sapply(pkgs,getPackageIfNeeded))
if (!require("NNLM"))
devtools::install_version("NNLM", version = "0.4.3")
if (!require("swne"))
devtools::install_github("yanwu2014/swne")
if (!require("perturbLM"))
devtools::install_github("yanwu2014/perturbLM", upgrade = "never")
Set the file.handle parameter to the name of the sample to be analyzed (this can include path information if data not in the same directory as this file). Generate a summary table for each sample, as well as cluster enrichment heatmaps, tSNE plots, and differential expression barplots.
The hESC_run_regression.R and hESC_run_clustering.R must have been run on the sample first. (This has already been done by Parekh et al.)
Load all required libraries.
#### Create t-SNE plots, differential expression heatmaps, and cluster enrichment heatmaps
library(perturbLM)
library(swne)
library(ggplot2)
library(RColorBrewer)
library(Seurat)
library(grid)
library(gtable)
Define file locations and load data. Example below includes path information for my computer. Must be modified to work appropriately on your computer.
## Name of the sample to be analyzed
file.handle <- "~/Downloads/Parekh_paper_data/up-tf-stem"
#file.handle <- "up-tf-stem"
# file.handle <- "up-tf-endo"
# file.handle <- "up-tf-multi"
# file.handle <- "up-tf-klf"
# file.handle <- "up-tf-myc"
# file.handle <- "up-tf-neuron-nohygro"
## Load regression results
coefs.df <- read.table(paste(file.handle, "regression.pvals.tsv", sep = "_"), sep = "\t",
header = T, stringsAsFactors = F)
top.coefs.df <- subset(coefs.df, FDR < 0.05 & abs(cf) > 0.025)
print(sort(table(top.coefs.df$Group), decreasing = T))
KLF4 CDX2 SNAI2 ONECUT1
487 272 226 132
MYOG ASCL1 NEUROD1 MYOD1
122 121 112 101
TBX5 MYC HNF1A RUNX1
55 50 47 45
NEUROG3 ASCL4 MESP1 NEUROG1
44 39 36 35
LHX3 ASCL3 OTX2 SPI1
33 31 30 28
TFAP2C SOX2 PAX7 FOXA3
27 23 22 20
ASCL5 CRX HAND2 HOXA1
19 18 18 18
SPIC SOX3 FOXA2 SOX10
17 16 15 15
HOXA11 POU1F1 GATA4 SIX2
14 13 12 11
ETV2 ESRRG GATA2 HOXB6
9 6 6 5
SPIB GATA1 MEF2C MITF
4 3 3 3
SIX1 ERG HOXA10 LMX1A
3 2 2 2
mCherry-int-ctrl MYCL SRY ATF7
2 2 2 1
FOXP1 NRL
1 1
## Load clustering
r <- readRDS(paste(file.handle, "clustering.Robj", sep = "_"))
clusters <- r@ident; names(clusters) <- r@cell.names;
levels(clusters) <- paste("C", levels(clusters), sep = "")
clusters.list <- UnflattenGroups(clusters)
# update v2 Seurat object to v3
r <- UpdateSeuratObject(r)
Updating from v2.X to v3.X
Validating object structure
Updating object slots
Ensuring keys are in the proper strucutre
Ensuring feature names don't have underscores or pipes
Object representation is consistent with the most current Seurat version
## Load genotypes
# genotypes.list <- ReadGenotypes("up-tf-klf-myc_pheno_dict.csv")
genotypes.list <- ReadGenotypes(paste(file.handle, "pheno_dict.csv", sep = "_"))
genotypes.list <- lapply(genotypes.list, function(x) x[x %in% names(clusters)])
genotypes.sizes <- sapply(genotypes.list, length)
Calculate enrichment scores and summarize results.
## Calculate genotype enrichment
genotypes.cl.tbl <- GenotypeClusterCounts(genotypes.list, clusters.list)
genotypes.cl.pvals <- GenotypeClusterPvals(genotypes.cl.tbl)
genotypes.min.p <- apply(genotypes.cl.pvals, 1, function(x) min(abs(x)) * length(x))
metap::sumlog(genotypes.min.p)
Some studies omitted
chisq = 3750.279 with df = 118 p = 0
## Summarize analysis results
genotype.counts <- sort(table(top.coefs.df$Group), decreasing = T)
genotype.counts <- genotype.counts[names(genotype.counts) %in% rownames(genotypes.cl.pvals)]
genotype.avg_cf <- tapply(top.coefs.df$cf, top.coefs.df$Group, function(x) mean(abs(x)))[names(genotype.counts)]
genotype.summary <- data.frame(n_diff_genes = as.integer(genotype.counts), avg_cf = genotype.avg_cf)
genotype.summary$min_cluster_pval <- apply(genotypes.cl.pvals[rownames(genotype.summary), ], 1, function(x) min(abs(x)))
genotype.summary$min_cluster <- apply(genotypes.cl.pvals[rownames(genotype.summary), ], 1, function(x) names(x[which.min(abs(x))]))
genotype.summary$n_cells <- genotypes.sizes[rownames(genotype.summary)]
Uncomment code below if you want to save the output data to a file.
# write.table(genotype.summary, file = paste(file.handle, "summary.tsv", sep = "_"), sep = "\t")
Continue processing to identify significant genotypes.
## Determine significant genotypes
min.cl.pval <- 1e-12
min.diff.genes <- 40
ctrl.cl <- genotype.summary["mCherry", "min_cluster"]
sig.genotypes <- rownames(subset(genotype.summary, (abs(min_cluster_pval) < min.cl.pval & min_cluster != ctrl.cl)
| n_diff_genes > min.diff.genes))
sig.genotypes <- sort(unique(c(sig.genotypes, "mCherry-int-ctrl")), decreasing = T); print(sig.genotypes);
[1] "TBX5" "SNAI2" "RUNX1"
[4] "ONECUT1" "NEUROG3" "NEUROG1"
[7] "NEUROD1" "MYOG" "MYOD1"
[10] "MYC" "mCherry-int-ctrl" "KLF4"
[13] "HNF1A" "CDX2" "ASCL4"
[16] "ASCL1"
Generate barplot graph.
## Differential expression barplot
n.diff.genes <- genotype.summary[sig.genotypes, "n_diff_genes"]
names(n.diff.genes) <- sig.genotypes
gg.bar <- ggBarplot(n.diff.genes, fill.color = "purple")
# pdf(paste(file.handle, "diff_genes_barplot.pdf", sep = "_"), width = 3.5, height = 1.5)
# print(gg.bar)
# dev.off()
## Cluster enrichment heatmap (Figure 1c-e)
max.lp <- 50
genotypes.cl.lp <- apply(genotypes.cl.pvals[sig.genotypes,], 1:2, function(x) ifelse(x > 0, -log10(x), log10(-1 * x)))
genotypes.cl.lp[genotypes.cl.lp > max.lp] <- max.lp
genotypes.cl.lp[genotypes.cl.lp < -1 * max.lp] <- -1 * max.lp
# pdf(paste(file.handle, "cl_enrich_heatmap.pdf", sep = "_"), width = 4.5, height = 5)
# ggHeat(genotypes.cl.lp, clustering = "none", x.lab.size = 14, y.lab.size = 14)
# dev.off()
gg.heat <- ggHeat(t(genotypes.cl.lp), clustering = "none", x.lab.size = 14, y.lab.size = 14)
gg.bar <- gg.bar + theme(axis.text.x = element_blank())
gg.heat <- gg.heat + theme(legend.position = "none")
gg.1 <- ggplotGrob(gg.heat)
gg.2 <- ggplotGrob(gg.bar)
gg.aligned <- rbind(gg.2, gg.1, size = "first")
gg.aligned$widths <- grid::unit.pmax(gg.1$widths, gg.2$widths)
#pdf(paste(file.handle, "stacked_diff_genes_cl_enrich_nolabels.pdf", sep = "_"), width = 5.5, height = 6)
grid::grid.newpage()
grid::grid.draw(gg.aligned)
#dev.off()
Generate tSNE plots
## tSNE plots (Figure 1c-e)
plot.seed <- 32907098
tsne.emb <- Embeddings(object = r, reduction = "tsne")
#pdf(paste(file.handle, "tsne_plot.pdf", sep = "_"), width = 4, height = 4)
PlotDims(tsne.emb, sample.groups = clusters, pt.size = 0.75, alpha.plot = 0.5, label.size = 6, do.label = T,
show.legend = F, seed = plot.seed)
#dev.off()
#pdf(paste(file.handle, "tsne_plot_nolabel.pdf", sep = "_"), width = 4, height = 4)
PlotDims(tsne.emb, sample.groups = clusters, pt.size = 0.75, alpha.plot = 0.5, label.size = 0, do.label = F,
show.legend = F, seed = plot.seed)
#dev.off()
#pdf(paste(file.handle, "tsne_plot_batch.pdf", sep = "_"), width = 4, height = 4)
batch <- factor(r@meta.data$batch)
# names(batch) <- r@cell.names
names(batch) <- colnames(r)
batch <- batch[rownames(tsne.emb)]
PlotDims(tsne.emb, sample.groups = batch, pt.size = 1, alpha.plot = 0.5, label.size = 8, do.label = T,
show.legend = F, seed = plot.seed)
#dev.off()
## tSNE plot overlay
# plot.seed <- 32907098
# tsne.emb <- Embeddings(object = r, reduction = "tsne")
TF <- "NEUROD1"
tf.groups <- as.character(clusters); names(tf.groups) <- names(clusters);
tf.groups[names(tf.groups) %in% genotypes.list[[TF]]] <- TF
tf.groups[!names(tf.groups) %in% genotypes.list[[TF]]] <- ""
tf.groups <- factor(tf.groups)
#pdf(paste0(file.handle, "_tsne_plot_", TF, ".pdf"), width = 3, height = 3)
PlotDims(tsne.emb, sample.groups = tf.groups, pt.size = 0.35, alpha.plot = 0.4, label.size = 0, do.label = T,
show.legend = F, seed = plot.seed) + scale_color_manual(values = c("grey", "red")) + ggtitle(TF)
#dev.off()